#### ======================================================================
#### Online appendix: aggregated country-level summaries with poststratified
#### weights (by gender and age), plus unweighted benchmarks
#### ----------------------------------------------------------------------
#### Purpose:
####   Create a single country-level dataset used throughout the appendix.
####   For each study country and subgroup, the script produces:
####     - Unweighted summaries (complete-case sample size N and moments)
####     - Poststratified-weighted summaries (weighted moments and N_Weighted)
####     - Bootstrapped 95% CIs for weighted preference gaps (deltas)
####
#### Outcomes (1-10 scales):
####   - China_current, China_desired
####   - USA_current,   USA_desired
####
#### Weighting approach:
####   - Poststratification targets are constructed from UN WPP 2024
####     single-year-of-age population tables, aggregated into age bands and
####     crossed with gender, within each ISO3 country.
####   - Weights are attached back to the anonymized microdata and then used
####     to compute country-level means and standard deviations via the
####     {survey} package.
####
#### Subgroups:
####   - all, men, women, and age groups (18-29, 30-49, 50-59, 60+)
####
#### Input:
####   - data/anonymized_survey.csv
####   - data/WPP2024_POP_F01_3_POPULATION_SINGLE_AGE_FEMALE_subset.xlsx
####   - data/WPP2024_POP_F01_2_POPULATION_SINGLE_AGE_MALE_subset.xlsx
####
#### Output:
####   - data/complete_aggregated_results.csv
#### ======================================================================

library(dplyr)
library(tidyr)
library(stringr)
library(readxl)
library(readr)
library(survey)
library(countrycode)
library(purrr)
library(tibble)

#### ----------------------------------------------------------------------
#### Study sample: countries and demographic bins
#### ----------------------------------------------------------------------

# Study countries (ISO3 codes). The 30 countries retained in the final
# analysis after dropping 7 with insufficient quota adherence.
iso_codes <- c(
  "ARG","BFA","BLR","BOL","BRA","CMR","COD","COL","EGY","GEO",
  "GHA","HRV","HTI","HUN","IDN","KEN","LBN","MDG","MEX","MLI",
  "NGA","PER","PHL","POL","ROU","SRB","TUN","TUR","URY","VEN"
)

# Age groups used for poststratification. (AgeGroup in the survey data is
# expected to match these labels.)
age_levels <- c("18-29", "30-49", "50-59", "60+")

# Helper: map single-year ages into the age groups used for weighting.
# Note: ages 0-17 are retained temporarily and excluded later.
categorize_age <- function(age) {
  case_when(
    age <= 17 ~ "0-17",
    age <= 29 ~ "18-29",
    age <= 49 ~ "30-49",
    age <= 59 ~ "50-59",
    TRUE      ~ "60+"
  )
}

# Helper: country labels for tables/plots. Uses countrycode() as the default,
# then applies a small set of display-name overrides used in the manuscript.
country_label_from_iso3 <- function(x) {
  out <- countrycode(x, origin = "iso3c", destination = "country.name")
  case_when(
    x == "COD" ~ "DRC",
    x == "PHL" ~ "Philippines",
    TRUE ~ out
  )
}

#### ----------------------------------------------------------------------
#### Load anonymized microdata
#### ----------------------------------------------------------------------
#### The microdata must include:
####   - ISO3_code (country identifier)
####   - Gender (Female/Male)
####   - AgeGroup (pre-binned into the age_levels above)
####   - outcome variables: China_current, China_desired, USA_current, USA_desired
US_China_survey <- read_csv("data/anonymized_survey.csv", show_col_types = FALSE) %>%
  mutate(
    ISO3_code = as.character(ISO3_code),
    Gender    = factor(Gender, levels = c("Female", "Male")),
    AgeGroup  = factor(AgeGroup, levels = age_levels)
  ) %>%
  filter(ISO3_code %in% iso_codes, !is.na(AgeGroup))

#### ----------------------------------------------------------------------
#### Load UN WPP 2024 single-year-of-age population totals
#### ----------------------------------------------------------------------
#### WPP files are read as text to avoid type issues; numeric conversion is
#### applied after reshaping.
read_wpp2024 <- function(path, gender_label) {
  read_excel(path, sheet = "WPP", skip = 0, col_types = "text") %>%
    mutate(
      ISO3_code = as.character(ISO3_code),
      Year      = suppressWarnings(as.integer(Year)),
      Gender    = gender_label
    )
}

wpp_female <- read_wpp2024(
  "data/WPP2024_POP_F01_3_POPULATION_SINGLE_AGE_FEMALE_subset.xlsx",
  "Female"
)

wpp_male <- read_wpp2024(
  "data/WPP2024_POP_F01_2_POPULATION_SINGLE_AGE_MALE_subset.xlsx",
  "Male"
)

# Combine male/female tables, restrict to study countries, and keep the most
# recent year available in the WPP files.
wpp <- bind_rows(wpp_female, wpp_male) %>%
  mutate(Year = suppressWarnings(as.integer(Year))) %>%
  filter(ISO3_code %in% iso_codes)

latest_year <- max(wpp$Year, na.rm = TRUE)

# Age columns are named with integers (and sometimes a trailing "+").
age_cols <- grep("^\\d+\\+?$", names(wpp), value = TRUE)

# Reshape to long format: one row per (country, gender, age) with population.
wpp_long <- wpp %>%
  filter(Year == latest_year) %>%
  pivot_longer(cols = all_of(age_cols), names_to = "Age", values_to = "Population") %>%
  mutate(
    Age        = as.integer(str_replace(Age, "\\+", "")),
    Population = suppressWarnings(as.numeric(Population))
  )

#### ----------------------------------------------------------------------
#### Construct poststratification totals: ISO3 x Gender x AgeGroup
#### ----------------------------------------------------------------------
#### WPP values are in thousands; they are converted to person-count totals.
pop_totals <- wpp_long %>%
  mutate(
    AgeGroup = categorize_age(Age),
    Gender   = factor(Gender, levels = c("Female", "Male")),
    AgeGroup = factor(AgeGroup, levels = age_levels)
  ) %>%
  filter(AgeGroup != "0-17") %>%
  group_by(ISO3_code, Gender, AgeGroup) %>%
  summarise(Freq = 1000 * sum(Population, na.rm = TRUE), .groups = "drop")

#### ----------------------------------------------------------------------
#### Poststratify the survey and attach weights to the microdata
#### ----------------------------------------------------------------------
#### The base design is an unclustered design (ids = ~1) with strata defined
#### by ISO3, gender, and age group. postStratify() adjusts the sample to match
#### population margins within these strata.
design_base <- svydesign(
  ids    = ~1,
  data   = US_China_survey,
  strata = ~ISO3_code + Gender + AgeGroup
)

design_poststratified <- postStratify(
  design_base,
  ~ISO3_code + Gender + AgeGroup,
  pop_totals,
  partial = TRUE
)

# Store the resulting weights in the microdata for reference and diagnostics.
US_China_survey$weights <- as.numeric(weights(design_poststratified))

#### ----------------------------------------------------------------------
#### Helper: weighted mean and SD for a single variable
#### ----------------------------------------------------------------------
#### Uses svymean() and svyvar() from {survey}. SD is returned as sqrt(variance).
svy_mean_sd <- function(design, var) {
  f <- as.formula(paste0("~", var))
  m <- svymean(f, design, na.rm = TRUE)
  v <- svyvar(f, design, na.rm = TRUE)
  tibble(
    mean = as.numeric(coef(m)),
    sd   = sqrt(as.numeric(coef(v)))
  )
}

#### ----------------------------------------------------------------------
#### Helper: bootstrapped 95% CI for the weighted delta (desired - current)
#### ----------------------------------------------------------------------
#### Resamples respondents within a country-level survey design, recomputes
#### the poststratified weighted delta on each draw, and returns the 2.5th
#### and 97.5th percentiles. This matches the CIs reported in the 1st
#### revision manuscript (Table A1 / now Table OA2).
bootstrap_delta_ci <- function(design_subset, var_current, var_desired,
                               n_boot = 2000, seed = 42) {
  set.seed(seed)
  d <- design_subset$variables
  w <- as.numeric(weights(design_subset))

  boot_deltas <- replicate(n_boot, {
    idx <- sample.int(nrow(d), replace = TRUE)
    w_b <- w[idx]
    cur_b <- d[[var_current]][idx]
    des_b <- d[[var_desired]][idx]
    ok <- !is.na(cur_b) & !is.na(des_b)
    if (sum(ok) < 2) return(NA_real_)
    weighted.mean(des_b[ok], w_b[ok]) - weighted.mean(cur_b[ok], w_b[ok])
  })

  quantile(boot_deltas, probs = c(0.025, 0.975), na.rm = TRUE)
}

#### ----------------------------------------------------------------------
#### Country-level summaries (weighted): complete cases on all four outcomes
#### ----------------------------------------------------------------------
#### For each country, restrict to complete cases on all outcomes so that
#### the derived deltas are comparable within-country.
process_country_weighted <- function(design_subset, iso3) {
  d_cc <- subset(
    design_subset,
    !is.na(China_current) & !is.na(China_desired) &
      !is.na(USA_current) & !is.na(USA_desired)
  )

  n_complete <- nrow(d_cc$variables)
  if (n_complete == 0) return(NULL)

  sc_cur <- svy_mean_sd(d_cc, "China_current")
  sc_des <- svy_mean_sd(d_cc, "China_desired")
  su_cur <- svy_mean_sd(d_cc, "USA_current")
  su_des <- svy_mean_sd(d_cc, "USA_desired")

  # Bootstrapped 95% CIs for the preference gap (delta).
  # Seed is derived from the ISO3 code to ensure reproducibility across
  # runs while varying by country.
  iso_seed <- sum(utf8ToInt(iso3))
  cn_ci <- bootstrap_delta_ci(d_cc, "China_current", "China_desired",
                              seed = iso_seed)
  us_ci <- bootstrap_delta_ci(d_cc, "USA_current", "USA_desired",
                              seed = iso_seed + 1000)

  tibble(
    ISO3_code  = iso3,
    Country    = country_label_from_iso3(iso3),
    N          = n_complete,
    N_Weighted = sum(weights(d_cc)),

    China_Mean_Current = sc_cur$mean,
    China_SD_Current   = sc_cur$sd,
    China_Mean_Desired = sc_des$mean,
    China_SD_Desired   = sc_des$sd,
    China_Delta        = sc_des$mean - sc_cur$mean,
    China_Delta_CI_Lo  = cn_ci[["2.5%"]],
    China_Delta_CI_Hi  = cn_ci[["97.5%"]],

    USA_Mean_Current = su_cur$mean,
    USA_SD_Current   = su_cur$sd,
    USA_Mean_Desired = su_des$mean,
    USA_SD_Desired   = su_des$sd,
    USA_Delta        = su_des$mean - su_cur$mean,
    USA_Delta_CI_Lo  = us_ci[["2.5%"]],
    USA_Delta_CI_Hi  = us_ci[["97.5%"]]
  )
}

#### ----------------------------------------------------------------------
#### Country-level summaries (unweighted): complete cases on all four outcomes
#### ----------------------------------------------------------------------
#### These summaries provide unweighted benchmarks and unweighted N. They are
#### also used for annotations in several appendix figures.
process_country_unweighted <- function(df_country, iso3) {
  df_cc <- df_country %>%
    filter(
      !is.na(China_current), !is.na(China_desired),
      !is.na(USA_current),   !is.na(USA_desired)
    )
  
  n_complete <- nrow(df_cc)
  if (n_complete == 0) return(NULL)
  
  tibble(
    ISO3_code  = iso3,
    Country    = country_label_from_iso3(iso3),
    N          = n_complete,
    N_Weighted = NA_real_,

    China_Mean_Current = mean(df_cc$China_current),
    China_SD_Current   = sd(df_cc$China_current),
    China_Mean_Desired = mean(df_cc$China_desired),
    China_SD_Desired   = sd(df_cc$China_desired),
    China_Delta        = mean(df_cc$China_desired - df_cc$China_current),
    China_Delta_CI_Lo  = NA_real_,
    China_Delta_CI_Hi  = NA_real_,

    USA_Mean_Current = mean(df_cc$USA_current),
    USA_SD_Current   = sd(df_cc$USA_current),
    USA_Mean_Desired = mean(df_cc$USA_desired),
    USA_SD_Desired   = sd(df_cc$USA_desired),
    USA_Delta        = mean(df_cc$USA_desired - df_cc$USA_current),
    USA_Delta_CI_Lo  = NA_real_,
    USA_Delta_CI_Hi  = NA_real_
  )
}

#### ----------------------------------------------------------------------
#### Subgroup definitions used in the appendix
#### ----------------------------------------------------------------------
#### Subgroups are defined as quoted expressions so the same condition can be
#### applied to both the microdata (dplyr::filter) and the survey design object.
subgroup_filters <- list(
  all        = quote(TRUE),
  men        = quote(Gender == "Male"),
  women      = quote(Gender == "Female"),
  age_18_29  = quote(AgeGroup == "18-29"),
  age_30_49  = quote(AgeGroup == "30-49"),
  age_50_59  = quote(AgeGroup == "50-59"),
  age_60plus = quote(AgeGroup == "60+")
)

#### ----------------------------------------------------------------------
#### Run weighted summaries by subgroup
#### ----------------------------------------------------------------------
#### The subset operation is applied to the poststratified survey design; then
#### country-level weighted moments are computed within each subgroup.
run_weighted <- function(condition, subgroup_name) {
  df_sg <- US_China_survey %>% filter(!!condition)
  
  idx <- eval(condition, design_poststratified$variables, parent.frame())
  if (length(idx) == 1) idx <- rep(idx, nrow(design_poststratified$variables))
  
  des_sg <- subset(design_poststratified, idx)
  iso_list <- sort(unique(df_sg$ISO3_code))
  
  map_dfr(iso_list, function(iso) {
    process_country_weighted(subset(des_sg, ISO3_code == iso), iso)
  }) %>%
    mutate(Method = "weighted", Subgroup = subgroup_name)
}

#### ----------------------------------------------------------------------
#### Run unweighted summaries by subgroup
#### ----------------------------------------------------------------------
#### Unweighted summaries are computed directly from the microdata subset.
run_unweighted <- function(condition, subgroup_name) {
  df_sg <- US_China_survey %>% filter(!!condition)
  iso_list <- sort(unique(df_sg$ISO3_code))
  
  map_dfr(iso_list, function(iso) {
    process_country_unweighted(df_sg %>% filter(ISO3_code == iso), iso)
  }) %>%
    mutate(Method = "unweighted", Subgroup = subgroup_name)
}

#### ----------------------------------------------------------------------
#### Compile final dataset and write to disk
#### ----------------------------------------------------------------------
#### The output stacks unweighted and weighted results, preserving a Method
#### indicator so downstream scripts can pull the appropriate quantities for
#### plotting and annotation.
results_weighted   <- imap_dfr(subgroup_filters, run_weighted)
results_unweighted <- imap_dfr(subgroup_filters, run_unweighted)

complete_results <- bind_rows(results_unweighted, results_weighted) %>%
  select(Method, Subgroup, Country, ISO3_code, N, N_Weighted, everything())

write_csv(complete_results, "data/complete_aggregated_results.csv")